In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder,normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor  
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier,VotingClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from category_encoders.binary import BinaryEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import r2_score
from xgboost import XGBClassifier
import warnings 
warnings.filterwarnings("ignore")
In [3]:
df= pd.read_csv("../data/processed/processed_heart_attack.csv")
df.head()
Out[3]:
Age Cholesterol Heart Rate Exercise Hours Per Week Diet Stress Level Sedentary Hours Per Day Income BMI Triglycerides Physical Activity Days Per Week Sleep Hours Per Day Heart Attack Risk Systolic
0 67 208 72 4.168189 1 9 6.615001 261404 31.251233 286 0 6 0 158
1 21 389 98 1.813242 0 1 4.963459 285768 27.194973 235 1 7 0 165
2 21 324 72 2.078353 2 9 9.463426 235282 28.176571 587 4 4 0 174
3 84 383 73 9.828130 1 9 7.648981 125640 36.464704 378 3 4 0 163
4 66 318 93 10.070897 0 6 1.514821 160555 21.809144 231 1 5 0 91
In [4]:
df.columns
Out[4]:
Index(['Age', 'Cholesterol', 'Heart Rate', 'Exercise Hours Per Week', 'Diet',
       'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI',
       'Triglycerides', 'Physical Activity Days Per Week',
       'Sleep Hours Per Day', 'Heart Attack Risk', 'Systolic'],
      dtype='object')

Split Data¶

In [5]:
x =df.drop('Heart Attack Risk',axis=1)
y  = df['Heart Attack Risk']
In [6]:
x_train ,x_test ,y_train ,y_test =train_test_split(x ,y ,random_state=0,test_size=0.2)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

HyperParameter Tunning¶

HyperParameter Tunning directly affects model performance.

==> each model has its hyperparameter and each parameter has number of different values so the question now which the best values of this parameter that achieve the best performance and accuracy.

HyperParameter Tunning such as : GridsearchCV and Randomized Search.

1) GridsearchCV: GridSearchCV exhaustively considers all parameter combinations.

GridsearchCV advantage: more accuracy than Randomized Search as in Gridsearch try all the possibilities in value range and runnig models depend on number of possiblilties.

GridsearchCV disadvantage: take alot of time and high cost.

The GridSearchCV instance implements the usual estimator API: when “fitting” it on a dataset all the possible combinations of parameter values are evaluated and the best combination is retained.

2) Randomized Search: here try random values and learn random models.

Randomized Search advantage: cost and time redduction.

Randomized Search disadvantage: less accuracy than GridsearchCV .

GridSearch¶

HyperParameter Tunning of SVC¶

In [31]:
'''
HyperParameter Tunning (GridSearch)
1) For SVC
'''
model=SVC()
params = [
        {'C':[1, 10], 'kernel':['linear', 'sigmoid', 'poly'],'random_state':range(0,10)},
        {'C':[1, 10], 'kernel':['rbf'], 'gamma':[0.5, 0.6, 0.7, 0.1, 0.01, 0.01],'random_state':range(0,10)}
         ]
grid_search_svc=GridSearchCV(estimator=model,
                        param_grid=params,
                        scoring='recall',
                        n_jobs=-1)
grid_search_svc.fit(x_train,y_train)
Out[31]:
GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid=[{'C': [1, 10], 'kernel': ['linear', 'sigmoid', 'poly'],
                          'random_state': range(0, 10)},
                         {'C': [1, 10],
                          'gamma': [0.5, 0.6, 0.7, 0.1, 0.01, 0.01],
                          'kernel': ['rbf'], 'random_state': range(0, 10)}],
             scoring='recall')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid=[{'C': [1, 10], 'kernel': ['linear', 'sigmoid', 'poly'],
                          'random_state': range(0, 10)},
                         {'C': [1, 10],
                          'gamma': [0.5, 0.6, 0.7, 0.1, 0.01, 0.01],
                          'kernel': ['rbf'], 'random_state': range(0, 10)}],
             scoring='recall')
SVC()
SVC()
In [32]:
'''
best parms here is that when C:1 and kernel eq 'linear' and random_state eq  0 ==>default'
'''
grid_search_svc.best_params_
Out[32]:
{'C': 10, 'kernel': 'sigmoid', 'random_state': 0}
In [33]:
'''
here SVC score is 0.312

'''
grid_search_svc.best_score_
Out[33]:
0.312

HyperParameter Tunning of KNN¶

In [28]:
'''
HyperParameter Tunning (GridSearch)
2) For KNN
'''
knn_classifer=KNeighborsClassifier()
params = [{'n_neighbors': [3, 5, 7, 9],
         'weights': ['uniform', 'distance'],
         'algorithm':['ball_tree','kd_tree','brute'],
         'metric':['cityblock','cosine','euclidean','l1','l2','haversine','manhattan','nan_euclidean','minkowski'],
         'leaf_size': [15, 40]}]
grid_search_knn = GridSearchCV(knn_classifer,
                      param_grid=params,
                      scoring='recall')
grid_search_knn.fit(x_train, y_train)
Out[28]:
GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid=[{'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                          'leaf_size': [15, 40],
                          'metric': ['cityblock', 'cosine', 'euclidean', 'l1',
                                     'l2', 'haversine', 'manhattan',
                                     'nan_euclidean', 'minkowski'],
                          'n_neighbors': [3, 5, 7, 9],
                          'weights': ['uniform', 'distance']}],
             scoring='recall')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid=[{'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                          'leaf_size': [15, 40],
                          'metric': ['cityblock', 'cosine', 'euclidean', 'l1',
                                     'l2', 'haversine', 'manhattan',
                                     'nan_euclidean', 'minkowski'],
                          'n_neighbors': [3, 5, 7, 9],
                          'weights': ['uniform', 'distance']}],
             scoring='recall')
KNeighborsClassifier()
KNeighborsClassifier()
In [29]:
grid_search_knn.best_params_
Out[29]:
{'algorithm': 'brute',
 'leaf_size': 15,
 'metric': 'cosine',
 'n_neighbors': 3,
 'weights': 'distance'}
In [30]:
'''
here KNN score is 0.2988

'''
grid_search_knn.best_score_
Out[30]:
0.2988

HyperParameter Tunning of Random Forest Model¶

In [8]:
grid = [{'n_estimators':[100,300,500], 'max_depth':[None, 5,10,15],
        'min_samples_split':[2,5,10] , 'min_samples_leaf':[1,2,4]}]
In [9]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
In [10]:
grid_search = GridSearchCV(estimator = rfc, param_grid = grid, scoring = 'accuracy',n_jobs = -1,verbose = 2)
grid_search = grid_search.fit(x_train, y_train)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
In [11]:
grid_search_rfc =  GridSearchCV(rfc,
                      param_grid=grid,
                      scoring='recall')
In [12]:
grid_search.best_params_
Out[12]:
{'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}
In [14]:
grid_search.best_score_
Out[14]:
0.6433666191155493
In [267]:
'''
The best score from Hyper parameter tunning of Random forest model which is 0.643509272467903

'''
Out[267]:
'\nThe best score from Hyper parameter tunning of Random forest model which is 0.64365192582025698\n'

ROC¶

In [31]:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba[:, 1])
pd.options.display.float_format = '{:,.10f}'.format
df_Recall_per = pd.DataFrame({'Threshold': thresholds, 'Precision': precisions[:-1], 'Recall': recalls[:-1]})
df_Recall_per.head()
Out[31]:
Threshold Precision Recall
0 0.1200000000 0.3645179692 1.0000000000
1 0.1400000000 0.3641552511 0.9984350548
2 0.1500000000 0.3643632210 0.9984350548
3 0.1600000000 0.3645714286 0.9984350548
4 0.1700000000 0.3647798742 0.9984350548

percision-Thershold Curve¶

In [36]:
import plotly.express as px
px.line(df_Recall_per, x='Threshold', y='Precision', title='Precision vs Threshold', width=800, height=600)

Recall-Threshold Curve¶

In [38]:
px.line(df_Recall_per, x='Threshold', y='Recall', title='Recall vs Threshold', width=800, height=600)

Precision- Recall Curve¶

In [39]:
px.line(df_Recall_per, x='Recall', y='Precision', title='Precision-Recall Curve', width=800, height=600,hover_data=['Threshold'])
In [47]:
# Find Threshold with Recall >= 0.6

threshold = df_Recall_per.loc[df_Recall_per['Recall'] >= 0.6, 'Threshold'].max()

print('Threshold: ', threshold)
print('Precision: ', df_Recall_per.loc[df_Recall_per['Threshold'] == threshold, 'Precision'].values[0])
print('Recall: ', df_Recall_per.loc[df_Recall_per['Threshold'] == threshold, 'Recall'].values[0])
Threshold:  0.35
Precision:  0.3717948717948718
Recall:  0.6353677621283255

ROC Curve¶

In [40]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])

df_ROC = pd.DataFrame({'Threshold': thresholds, 'FPR': fpr, 'TPR': tpr})
df_ROC.head(10)
Out[40]:
Threshold FPR TPR
0 inf 0.0000000000 0.0000000000
1 0.6200000000 0.0008976661 0.0000000000
2 0.6100000000 0.0017953321 0.0031298905
3 0.5900000000 0.0026929982 0.0031298905
4 0.5700000000 0.0044883303 0.0093896714
5 0.5600000000 0.0071813285 0.0125195618
6 0.5500000000 0.0116696589 0.0140845070
7 0.5400000000 0.0134649910 0.0172143975
8 0.5300000000 0.0179533214 0.0187793427
9 0.5200000000 0.0260323160 0.0250391236
In [42]:
px.line(df_ROC, x='FPR', y='TPR', title='ROC Curve', width=800, height=600, hover_data=['Threshold'])

ML Classifcation¶

In [15]:
models={
    'KNN':KNeighborsClassifier(n_neighbors=5),
    'SVC':SVC(C=10, kernel= 'sigmoid', random_state = 0),
    'RF':RandomForestClassifier(n_estimators=100,min_samples_split=2,min_samples_leaf=1,max_depth=5),
    'Bagging_classifier':BaggingClassifier(DecisionTreeClassifier(),n_estimators=5,n_jobs=-1),
    'voting': VotingClassifier(estimators=[('LR',LogisticRegression()),('NB',GaussianNB()),('DT',DecisionTreeClassifier())])
}
In [16]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,recall_score,precision_score
import joblib
for name,model in models.items():
    print('--------- ',name,'-------------')
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print('accuracy_training: ',accuracy_score(y_train,model.predict(x_train)))
    print('accuracy_testing: ',accuracy_score(y_pred,y_test))
    print('confusion matrix: ',confusion_matrix(y_test,y_pred))
    print('recall score: ',recall_score(y_test,y_pred,average='weighted'))
    print('precision score: ',precision_score(y_test,y_pred,average='weighted'))
    joblib.dump(model,name+'_model.h5')
    print('-'*30)

'''
the better result from classification is Random forest model.
'''
---------  KNN -------------
accuracy_training:  0.7174037089871612
accuracy_testing:  0.5556189389617798
confusion matrix:  [[824 290]
 [489 150]]
recall score:  0.5556189389617798
precision score:  0.5230772331259191
------------------------------
---------  SVC -------------
accuracy_training:  0.4864479315263909
accuracy_testing:  0.5464917284654878
confusion matrix:  [[738 376]
 [419 220]]
recall score:  0.5464917284654878
precision score:  0.539899972974961
------------------------------
---------  RF -------------
accuracy_training:  0.6433666191155493
accuracy_testing:  0.6354820308043354
confusion matrix:  [[1114    0]
 [ 639    0]]
recall score:  0.6354820308043354
precision score:  0.40383741147520236
------------------------------
---------  Bagging_classifier -------------
accuracy_training:  0.9504992867332382
accuracy_testing:  0.5681688533941814
confusion matrix:  [[825 289]
 [468 171]]
recall score:  0.5681688533941814
precision score:  0.5409755661476753
------------------------------
---------  voting -------------
accuracy_training:  0.6433666191155493
accuracy_testing:  0.6354820308043354
confusion matrix:  [[1114    0]
 [ 639    0]]
recall score:  0.6354820308043354
precision score:  0.40383741147520236
------------------------------
Out[16]:
'\nthe better result from classification is Random forest model.\n'
In [272]:
'''
make this command in order to get features that i will use after that in deployment.
'''
df.columns
Out[272]:
Index(['Age', 'Cholesterol', 'Heart Rate', 'Exercise Hours Per Week', 'Diet',
       'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI',
       'Triglycerides', 'Physical Activity Days Per Week',
       'Sleep Hours Per Day', 'Heart Attack Risk', 'Systolic'],
      dtype='object')
In [273]:
features = ['Age', 'Cholesterol', 'Heart Rate', 'Exercise Hours Per Week', 'Diet',
       'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI',
       'Triglycerides', 'Physical Activity Days Per Week',
       'Sleep Hours Per Day', 'Heart Attack Risk', 'Systolic']
In [274]:
'''
this to having feature and scaler that help me in deployment.
'''
joblib.dump(features,'features.h5')
joblib.dump(scaler,'scaler.h5')
Out[274]:
['scaler.h5']
In [275]:
df
Out[275]:
Age Cholesterol Heart Rate Exercise Hours Per Week Diet Stress Level Sedentary Hours Per Day Income BMI Triglycerides Physical Activity Days Per Week Sleep Hours Per Day Heart Attack Risk Systolic
0 67 208 72 4.168189 1 9 6.615001 261404 31.251233 286 0 6 0 158
1 21 389 98 1.813242 0 1 4.963459 285768 27.194973 235 1 7 0 165
2 21 324 72 2.078353 2 9 9.463426 235282 28.176571 587 4 4 0 174
3 84 383 73 9.828130 1 9 7.648981 125640 36.464704 378 3 4 0 163
4 66 318 93 10.070897 0 6 1.514821 160555 21.809144 231 1 5 0 91
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8758 60 121 61 7.917342 2 8 10.806373 235420 19.655895 67 7 7 0 94
8759 28 120 73 16.558426 2 8 3.833038 217881 23.993866 617 4 9 0 157
8760 47 250 105 3.148438 1 5 2.375214 36998 35.406146 527 4 4 1 161
8761 36 178 60 3.789950 0 5 0.029104 209943 27.294020 114 2 8 0 119
8762 25 356 75 18.081748 2 8 9.005234 247338 32.914151 180 7 4 1 138

8763 rows × 14 columns

summary ML¶

In [135]:
'''

- The result of Feature selection using both "high correlation filter" or "Random Forest Model"  as per below:
1) High correlation between Heart Attack Risk and 'Sedentary Hours Per Day' , 'BMI' , 'Exercise Hours Per Week' , 'Income' 
, 'Triglycerides' , 'Cholesterol' , 'Age' , 'Heart Rate' , 'Systolic' , 'Stress Level' , 'Physical Activity Days Per Week',
'Sleep Hours Per Day' , 'Diet' .


- GridsearchCV- HyperParmeter Tunning Result ==> The best alogorithm is Random Forest Model.

- The best classification model is  Random Forest Algorithm as it has more accuracy_training and accuracy_testing.


'''
Out[135]:
'\n\n- The result of Feature selection using both "high correlation filter" or "Random Forest Model"  as per below:\n1) High correlation between Heart Attack Risk and \'Sedentary Hours Per Day\' , \'BMI\' , \'Exercise Hours Per Week\' , \'Income\' \n, \'Triglycerides\' , \'Cholesterol\' , \'Age\' , \'Heart Rate\' , \'Systolic\' , \'Stress Level\' , \'Physical Activity Days Per Week\',\n\'Sleep Hours Per Day\' , \'Diet\' .\n\n\n- GridsearchCV- HyperParmeter Tunning Result ==> The best alogorithm is Random Forest Model.\n\n- The best classification model is  Random Forest Algorithm as it has more accuracy_training and accuracy_testing.\n\n\n'